Job Board User Analysis¶

Author: Madison Laprise¶

[A data analysis project conducted utilizing 7.5 million rows of user data from a popular online job board]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime, timedelta
import plotly.io as pio
pio.renderers.default = "notebook"
df = pd.read_csv("member_role_state_data_project.csv", parse_dates=["subscribe_date", "upgrade_date"])

Part 1: Most Common Role in Each State¶

In [2]:
# Format state codes to Plotly format ("XX" capital letters, no white space)
df["state"] = df["state"].str.strip().str.upper()

# Group by state and role, count how many members for each combo
role_counts = df.groupby(["state", "member_role"]).size().reset_index(name="count")

# For each state, keep only the role with the max count
top_roles = role_counts.loc[role_counts.groupby("state")["count"].idxmax()].reset_index(drop=True)

# Add dummy rows for job roles that aren't maximums in each state with fake identifiers, so they still show up in legend
## This is a personal touch, I thought it was more helpful to see what roles were being unrepresented
dummy_states = [f"DummyState{i}" for i in range(len(top_roles["state"].unique()))]
dummy_mapping = dict(zip(top_roles["state"].unique(), dummy_states))
top_roles["dummy_state"] = top_roles["state"].map(dummy_mapping)
all_roles = df["member_role"].unique()
for dummy_state in dummy_states:
    for role in all_roles:
        if not ((top_roles["dummy_state"] == dummy_state) & (top_roles["member_role"] == role)).any():
            top_roles = pd.concat([top_roles, pd.DataFrame({"dummy_state": [dummy_state], "member_role": [role], "count": [0]})], ignore_index=True)


# Plot
fig = px.choropleth(
    top_roles,
    locations="state",
    locationmode="USA-states",
    color="member_role",
    scope="usa",
    title="Most Common Job Role Among Site Members, by State",
    color_discrete_sequence=px.colors.qualitative.Dark24, # Although the 24 palettes are not my favorite, they have enough colors to cover all the roles
    hover_data={
        "state": True,
        "member_role": True,
        "count": True
    },
    labels={
        "state": "State",
        "member_role": "Job Role",
        "count": "Member Count"
    },
    height=600,
    width=1000,
)

fig.show()

Part 2: Member Indicies¶

Part 2a: Technology Index¶

In [3]:
# Filter for members in the "Technology" role
tech_df = df[df["member_role"] == "Technology"].copy()

# Count total members and tech members by state
total_by_state = df["state"].value_counts()
tech_by_state = tech_df["state"].value_counts()

# Build DataFrame with index values
tech_index_df = pd.DataFrame({
    "tech_pct": tech_by_state / total_by_state,
    "total_members": total_by_state
})

# Calculate national % of tech roles
national_pct = len(tech_df) / len(df)

# Index: state % divided by national %
tech_index_df["tech_index"] = tech_index_df["tech_pct"] / national_pct

# Reset index for Plotly
tech_index_df = tech_index_df.reset_index().rename(columns={"index": "state"})

# Plot
fig = px.choropleth(
    tech_index_df,
    locations="state",
    locationmode="USA-states",
    color="tech_index",
    scope="usa",
    title="Technology Member Index, by State",
    subtitle="Tech Index: How each state's share of Technology members compares to the national average (1.0 = average)",
    color_continuous_scale="rdbu",
    hover_data={
        "state": True,
        "tech_index": True,
        "total_members": True
    },
    labels={
        "state": "State",
        "tech_index": "Tech Index",
        "total_members": "Total Members"
    },
    height=600,
    width=1000,
)

fig.show()

Part 2b: Product & Program Management Index¶

In [4]:
# Filter for members in the "Technology" role
PM_df = df[df["member_role"] == "Project & Program Management"].copy()

# Count PM by state
PM_by_state = PM_df["state"].value_counts()

# Build DataFrame with index values
PM_index_df = pd.DataFrame({
    "PM_pct": PM_by_state / total_by_state,
    "total_members": total_by_state
})

# Calculate national % of tech roles
national_pct = len(PM_df) / len(df)

# Index: state % divided by national %
PM_index_df["PM_index"] = PM_index_df["PM_pct"] / national_pct

# Reset index for Plotly
PM_index_df = PM_index_df.reset_index().rename(columns={"index": "state"})

# Plot
fig = px.choropleth(
    PM_index_df,
    locations="state",
    locationmode="USA-states",
    color="PM_index",
    scope="usa",
    title="Project & Program Management Member Index, by State",
    subtitle="PM Index: How each state's share of Project & Program Management members compares to the national average (1.0 = average)",
    color_continuous_scale="rdbu",
    hover_data={
        "state": True,
        "PM_index": True,
        "total_members": True
    },
    labels={
        "state": "State",
        "PM_index": "PM Index",
        "total_members": "Total Members"
    },
    height=600,
    width=1000,
)

fig.show()

Part 3: Premium Conversion Investigation¶

"In the last 6-months, which state and role has the highest rate of premium conversion within 24 hours of subscribing?"¶

In [5]:
# Filter dataset to only include past 6 months
six_months_ago = datetime.now() - timedelta(days=180)
df_recent = df[df["subscribe_date"] > six_months_ago]

# Eliminate members who have not upgraded
df_recent = df_recent.dropna(subset=["upgrade_date"])

# Find members who upgraded within 24 hours of subscribing
df_recent["upgraded_within_24h"] = df_recent["upgrade_date"] - df_recent["subscribe_date"] <= pd.Timedelta(hours=24)

# Count by state
upgraded_within_24h_by_state = df_recent.groupby("state")["upgraded_within_24h"].sum().sort_values(ascending=False)
# Count by role
upgraded_within_24h_by_role = df_recent.groupby("member_role")["upgraded_within_24h"].sum().sort_values(ascending=False)

#### Report and Visualize Results ####

### STATE
print("****\nSTATE")
print("State with highest rate of 24-hour upgrades (past 6 months):")
print(f"{upgraded_within_24h_by_state.index[0]} -- {upgraded_within_24h_by_state.max()} total upgrades within 24 hours")

# Convert Series to DataFrame for Plotly
upgraded_within_24h_by_state_df = upgraded_within_24h_by_state.reset_index()
upgraded_within_24h_by_state_df.columns = ["state", "upgraded_within_24h"]

# Choropleth for state
fig_state = px.choropleth(
    upgraded_within_24h_by_state_df,
    locations="state",
    locationmode="USA-states",
    color="upgraded_within_24h",
    scope="usa",
    title="24-hour upgrades by State (past 6 months)",
    color_continuous_scale="blues",
    hover_data={
        "state": True,
        "upgraded_within_24h": True
    },
    labels={
        "state": "State",
        "upgraded_within_24h": "Total 24-hour upgrades"
    },
    height=600,
    width=1000,
)
fig_state.show()

### ROLE
print("****\nROLE")
print("Role with highest rate of 24-hour upgrades (past 6 months):")
print(f"{upgraded_within_24h_by_role.index[0]} -- {upgraded_within_24h_by_role.max()} total upgrades within 24 hours")
# Bar chart for role
fig_bar = px.bar(
    upgraded_within_24h_by_role,
    x=upgraded_within_24h_by_role.index,
    y=upgraded_within_24h_by_role.values,
    title="24-hour upgrades by Role (past 6 months)",
    labels={
        "x": "Role",
        "y": "Total 24-hour upgrades"
    },
    color=upgraded_within_24h_by_role.values,
    color_continuous_scale="blues",
    color_continuous_midpoint=0,
    height=600,
    width=1000,
)
fig_bar.update_layout(
    xaxis_title="Role",
    yaxis_title="Total 24-hour upgrades",
    coloraxis_showscale=False  # Remove color bar for clarity
)
fig_bar.show()
****
STATE
State with highest rate of 24-hour upgrades (past 6 months):
CA -- 304 total upgrades within 24 hours
****
ROLE
Role with highest rate of 24-hour upgrades (past 6 months):
Operations & General Management -- 505 total upgrades within 24 hours